R Notebook, showing the functionality of the scMoC for clustering multi-omics data specifically for scRNA-seq data and scATAC-seq data for paired datasets (i.e. measured for exact same cell). This notebook used sci-CAR data for Mouse kideny as example to work on. Other notebooks shows the same flow on different datasets.
We start with loading the scMoC functions as well as libraries needed to process this notebook
# Load dependencies
if (!require("pacman")) install.packages("pacman")
Loading required package: pacman
pacman::p_load(Seurat,dplyr,patchwork,repr,Matrix,qdapTools,FNN,
ggplot2,gplots,broman,BiocManager,limma,mclust,
cluster,OmicsMarkeR,abind,corrplot,svglite,EnhancedVolcano)
source("R/scmoc.R")
marker_genes <- read.csv("R/marker_genes_list.csv",header = FALSE,stringsAsFactors=FALSE)
white <- brocolors("crayons")["White"]
blue <- brocolors("crayons")["Cerulean"]
my_palette<- colorRampPalette(c(white,blue))(n=100)
Some globale variable definitions
# sci-CAR Variables
MinCellsGenes = 3
MinFeat = 200
MinCellsPeaks = 3
N_PCs = 20
N_Neighbours = 50
Max_N_PCs = 20
impute_NN = 50
N_PC = 20
N_LSI = 20
N_genes = 25
N_peaks = 15
RNA_clusters_res = 0.8
ATAC_clusters_res = 0.8
Read the sci-CAR data (The files can be downloaded from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089)
##------------- Reading Raw Files -------------------------------------------------------
print ("Reading RNA files", quote=FALSE)
[1] Reading RNA files
rna_count_data <- read.csv("GSM3271044_RNA_mouse_kidney_gene_count.txt",header = FALSE,skip = 2,sep = " ")
print (c("RNA Files -> Count data: ",nrow(rna_count_data)), quote=FALSE)
[1] RNA Files -> Count data: 7036187
rna_cell_data <- read.csv("GSM3271044_RNA_mouse_kidney_cell.txt", header=TRUE)
print (c("RNA Files -> Count data: ",nrow(rna_cell_data)), quote=FALSE)
[1] RNA Files -> Count data: 13893
rna_gene_data <- read.csv("GSM3271044_RNA_mouse_kidney_gene.txt",header = TRUE)
print (c("RNA Files -> Count data: ",nrow(rna_gene_data)), quote=FALSE)
[1] RNA Files -> Count data: 49584
tmp_rna <- matrix(0L, nrow = nrow(rna_gene_data),ncol = nrow(rna_cell_data))
print(c("Creating Zero Matrix with dim : ", dim(tmp_rna)),quote = FALSE)
[1] Creating Zero Matrix with dim : 49584
[3] 13893
for (i in 1:nrow(rna_count_data)){
tmp_rna[rna_count_data[i,1],rna_count_data[i,2]] = rna_count_data[i,3]
}
print(c("RNA Files -> Data Sparsity = ", round(
nrow(rna_count_data)*100/(nrow(rna_cell_data) * nrow(rna_gene_data)),2),"%"),quote=FALSE)
[1] RNA Files -> Data Sparsity = 1.02
[3] %
rna_sparce_mat <- Matrix(tmp_rna, sparse = TRUE)
colnames(rna_sparce_mat) <- rna_cell_data[,1]
rownames(rna_sparce_mat) <- rna_gene_data[,3]
## Reading ATAC files
print ("Reading ATAC files", quote=FALSE)
[1] Reading ATAC files
atac_count_data <- read.csv("GSM3271045_ATAC_mouse_kidney_peak_count.txt",header = FALSE,skip = 2,sep = " ")
print (c("ATAC Files -> Count data: ",nrow(atac_count_data)), quote=FALSE)
[1] ATAC Files -> Count data: 9448526
atac_cell_data <- read.csv("GSM3271045_ATAC_mouse_kidney_cell.txt", header=TRUE)
print (c("ATAC Files -> Count data: ",nrow(atac_cell_data)), quote=FALSE)
[1] ATAC Files -> Count data: 13395
atac_gene_data <- read.csv("GSM3271045_ATAC_mouse_kidney_peak.txt",header = TRUE)
print (c("ATAC Files -> Count data: ",nrow(atac_gene_data)), quote=FALSE)
[1] ATAC Files -> Count data: 252741
tmp_atac <- matrix(0L, nrow = nrow(atac_gene_data),ncol = nrow(atac_cell_data))
print(c("Creating Zero Matrix with dim : ", dim(tmp_atac)),quote = FALSE)
[1] Creating Zero Matrix with dim : 252741
[3] 13395
for (i in 1:nrow(atac_count_data)){
tmp_atac[atac_count_data[i,1],atac_count_data[i,2]] = atac_count_data[i,3]
}
print(c("ATAC Files -> Data Sparsity = ", round(
nrow(atac_count_data)*100/(nrow(atac_cell_data) * nrow(atac_gene_data)),2),"%"),quote=FALSE)
Warning in nrow(atac_cell_data) * nrow(atac_gene_data): NAs produced by
integer overflow
[1] ATAC Files -> Data Sparsity = <NA>
[3] %
atac_sparce_mat <- Matrix(tmp_atac, sparse = TRUE)
colnames(atac_sparce_mat) <- atac_cell_data[,1]
rownames(atac_sparce_mat) <- atac_gene_data[,3]
## Find The Common Cells and Create Seurat Objects
# Finding the Common Cells
common_cells <-intersect(colnames(rna_sparce_mat), colnames(atac_sparce_mat))
# Creat Seuart Object for RNA
rna <- CreateSeuratObject(count=rna_sparce_mat[,common_cells], project = "sciCAR",
min.cells = MinCellsGenes, min.features = MinFeat)
Warning: Non-unique features (rownames) present in the input matrix, making
unique
Warning: Feature names cannot have underscores ('_'), replacing with dashes
('-')
# Creat Seurat Object for ATAC
atac <- CreateSeuratObject(count=atac_sparce_mat[,common_cells], project = "sciCAR",
min.cells = MinCellsPeaks)
Warning: Non-unique features (rownames) present in the input matrix, making
unique
Warning: Feature names cannot have underscores ('_'), replacing with dashes
('-')
Visualize QC metrics as a violin plot for RNA data
#Calculate the percentage of Mitrocondrial genes in the cell
rna[["percent.mt"]] <- PercentageFeatureSet(rna, pattern = "^mt-")
# Visualize QC metrics as a violin plot for RNA data
VlnPlot(rna, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(rna, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(rna, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1+plot2
Visualize QC metrics as a violin plot for ATAC data
# Visualize QC metrics as a violin plot
VlnPlot(atac, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
According to the figures we set the thresholds of the data as follows :
First for RNA:
1- Remove cells that contains more than 30% Mito Data
2- Remove cells that have less than 200 different genes [almost empty cells]
3- Remove cells that have more than 2500 different genes[noisy cells]
rna <- subset(rna, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 30)
Second for ATAC
1- Remove cells that have less than 4000 different peaks [almost empty cells]
2- Remove cells that have more than 10000 different peaks[noisy cells]
atac <- subset(atac, subset = nFeature_RNA < 4000 & nCount_RNA < 10000 )
Select only common cells that passes both QC task
common_cells <-intersect(colnames(rna), colnames(atac))
rna <- rna[,common_cells]
atac <- atac[,common_cells]
# 1- Normailize
print("RNA Data: Making Normailization")
[1] "RNA Data: Making Normailization"
rna <- NormalizeData(rna, normalization.method = "LogNormalize", scale.factor = 10000)
# 2- Select Top variable genes
print("RNA Data: Finding Top Variable Genes")
[1] "RNA Data: Finding Top Variable Genes"
rna <- FindVariableFeatures(rna, selection.method = "vst", nfeatures = 6000)
# 3- Do the processing
print("RNA Data: Going to Processing function ")
[1] "RNA Data: Going to Processing function "
rna <- process.object(rna,do_scaling = TRUE)
[1] "Processing Function: Scaling Data"
Centering and scaling data matrix
# 4- Do PCA on the highly Variable genes only
print("RNA Data: Going to Processing function ")
[1] "RNA Data: Going to Processing function "
rna <- RunPCA(rna, features = VariableFeatures(object = rna))
PC_ 1
Positive: Fut9, Ghr, Keg1, Cyp4b1, Gm19950, Gk, RP23-306P12.3, Slco1a6, Bnc2, Slc13a1
Cyp2j13, Gramd1b, St8sia1, Slc7a13, Ndrg1, Phyhipl, Slc13a3, Slc16a2, Ces1f, Kcnk5
Slc18a1, Fgf1, Hsd3b2, Pecr, Sox2ot, Ctnna2, Cd36, Nr1h4, Zeb2, Gss
Negative: Mecom, Erbb4, Slit2, Slc16a7, Cacnb4, Rgs6, Phactr1, Egf, Nr3c2, Abca13
Prdm16, Slc12a1, Slc8a1, Stox2, Nckap5, Efna5, Stk39, Pde7b, Slc5a3, Slc12a3
Rap1gap2, Tmem117, Kng2, Utrn, Frmpd4, Maml3, Ipcef1, Tox3, Nbea, Tacc1
PC_ 2
Positive: Slc12a1, mt-Rnr1, mt-Rnr2, Slc5a12, Egf, Magi2, Erbb4, Umod, Cacnb4, Abca13
Gatm, Gabrb3, Sgcz, Ndrg1, Ldb2, Bnc2, Meis2, Tenm2, Slc7a7, Alpl
Ptger3, Rn18s-rs5, Tshz2, Fbxl7, Slc2a13, Ebf1, Egfl6, Tmem207, Dlc1, Wipf1
Negative: Slc4a9, Slc26a4, Atp6v0d2, Lsamp, Pde4b, Gm12121, Atp6v1c2, Syn2, Tmem117, Rcan2
Insrr, Alcam, Tmem163, Plcg2, Rp1, Thsd7a, Bmpr1b, Ralgapa2, Ccbe1, Zcchc16
Clnk, Malrd1, Prkca, Naaladl2, Irs1, Nbea, Plxna4, Hepacam2, Sh2d4b, Fam65b
PC_ 3
Positive: Cyp7b1, Slc7a13, Mep1b, Aadat, Esr1, Chrm3, Rnf24, Gm43158, Gpm6a, Fgf1
C8a, Crot, Myo5a, Cd36, Slc6a18, Bcat1, Satb2, Slc22a7, Mep1a, Slc22a19
Ppm1e, Gramd1b, Wdr17, Acox2, Gclc, Gm853, A330033J07Rik, Atp8b4, Ccdc162, Ido2
Negative: Slc5a12, Car12, Bnc2, Itpr2, Gatm, Ndrg1, Magi2, Gabrb3, Slc13a1, Slc22a23
RP23-312B17.2, Alpl, Slc7a7, Slc16a14, Slc7a8, Angpt1, Enpp2, Tmem108, Stxbp5l, D630023F18Rik
Slc4a9, Phyhipl, Maf, Cyp4b1, Folh1, Slc26a4, Slc6a19, Ccdc141, Fut9, Lipa
PC_ 4
Positive: Slc12a3, Abca13, Trpm6, Klhl3, Wnk1, Cacnb4, Tsc22d1, Sgms2, Egf, Magi2
Slc8a1, Dach1, Erbb4, Calb1, Kng2, Ccdc141, Kctd1, Cadps2, Slc16a7, Tox3
Atp1a1, Mecom, Slit2, Arnt2, Rgs6, Cyfip2, Prkd1, Efna5, Acss3, Cdk14
Negative: Slc4a9, Slc26a4, Atp6v0d2, Gm12121, mt-Rnr1, Rcan2, mt-Rnr2, Pde4b, Lsamp, Tshz2
Syn2, Insrr, Atp6v1c2, Ldb2, Meis2, Dlc1, Prkg1, Plcg2, Thsd7a, Tmem163
Ccbe1, Ebf1, Rp1, Adgrf5, Clnk, Alcam, Tmem117, Plpp1, Plxna4, Rbms3
PC_ 5
Positive: Egf, Slc4a9, Slc26a4, Abca13, Slc12a1, Atp6v0d2, Zcchc16, Slc16a7, Atp6v1c2, Gm12121
Pde4b, Insrr, Lsamp, Syn2, Umod, Tmem163, Irs1, Tmem72, Rp1, Slc12a3
Kng2, Casr, Plcg2, Rcan2, Ptger3, Malrd1, Erbb4, Cacnb4, Rap1gap2, Rgs6
Negative: Frmpd4, Rbms3, Fxyd4, Pde8b, Tmem45b, Pde1c, Samd12, Npnt, Gpc5, Rasgrf2
Phactr1, Dnm3, Mgat4c, Aqp4, Styk1, Aim1, Frem2, Col26a1, Srgap3, Stard13
Tmem150c, Bmpr1b, Atp8a1, 5730508B09Rik, Gata3, Btc, Tshz2, Ehf, Pde7b, Hacd4
# 5- Cluster the RNA data
rna <- FindNeighbors(rna, dims = 1:N_PCs,k.param=N_Neighbours)
Computing nearest neighbor graph
Computing SNN
rna <- FindClusters(rna, resolution = RNA_clusters_res,algorithm = 1)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 9080
Number of edges: 843341
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8524
Number of communities: 15
Elapsed time: 2 seconds
# 6- Project to the UMAP
rna <- RunUMAP(rna, dims = 1:N_PCs)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
16:56:06 UMAP embedding parameters a = 0.9922 b = 1.112
16:56:06 Read 9080 rows and found 20 numeric columns
16:56:06 Using Annoy for neighbor search, n_neighbors = 30
16:56:06 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:56:08 Writing NN index file to temp file /tmp/RtmpcMXYtB/file20abb496e77b
16:56:08 Searching Annoy index using 1 thread, search_k = 3000
16:56:12 Annoy recall = 100%
16:56:12 Commencing smooth kNN distance calibration using 1 thread
16:56:13 Initializing from normalized Laplacian + noise
16:56:14 Commencing optimization for 500 epochs, with 392706 positive edges
16:56:49 Optimization finished
options(repr.plot.width=6 , repr.plot.height=6)
DimPlot(rna, reduction = "umap",pt.size = 0.25, label = TRUE, label.size = 8) + NoLegend()
# 1- Normailize
print("ATAC Data: Making Normailization")
[1] "ATAC Data: Making Normailization"
atac <- NormalizeData(atac, normalization.method = "LogNormalize", scale.factor = 10000)
# 2- Do the processing
print("ATAC Data: Going to Processing function ")
[1] "ATAC Data: Going to Processing function "
atac <- process.object(atac,do_scaling = TRUE, do_PCA = TRUE, do_LSI = TRUE)
[1] "Processing Function: Scaling Data"
Centering and scaling data matrix
[1] "Processing Function: Calculating PCA"
Warning in PrepDR(object = object, features = features, verbose = verbose):
The following 18 features requested have zero variance (running reduction
without them): chr1.17279, chr10.12112, chr14.16, chr14.95, chr2.4332,
chr2.9489, chr2.16550, chr3.106, chr5.11249, chr7.3636, chr8.5234, chrX.
2065, chrX.2190, chrX.2521, chrX.4110, chrX.4127, chrX.4975, chrY.274
PC_ 1
Positive: chrUn-JH584304.24, chr3.140, chr9.2, chr5.15319, chr9.14010, chr17.4484, chr11.13342, chrX.5319, chr15.5362, chr1.18288
chr16.5778, chr9.20, chrUn-GL456396.1, chr9.32, chr19.6629, chr9.1, chr9.13, chr18.5839, chr9.33, chrUn-JH584304.27
chr9.18, chr4.5085, chr16.809, chr9.30, chrUn-JH584304.43, chr2.279, chr9.24, chr16.3943, chr9.37, chr9.29
Negative: chr2.10376, chr2.10377, chr17.4840, chr14.1099, chr14.1100, chr19.543, chr8.8730, chr2.1871, chrX.5396, chr4.8951
chr4.13810, chrX.1983, chr2.8974, chr9.3251, chrX.4590, chr2.13651, chr15.3213, chr1.12646, chr6.10992, chrX.3013
chr15.3182, chrY.924, chr8.2765, chrUn-GL456383.9, chr5.7168, chr11.10639, chr3.11439, chrX.633, chrX.5610, chrX.6021
PC_ 2
Positive: chr2.10376, chr2.10377, chr17.4840, chr14.1099, chr14.1100, chr9.2707, chr14.1097, chrUn-GL456392.8, chr6.9903, chrUn-GL456389.5
chr4.8951, chr6.13135, chr2.7215, chr8.5002, chr2.13651, chr10.1892, chrX.5396, chr1.3638, chr17.7989, chrY.1286
chr4.6718, chr9.12920, chr1.13782, chr7.6007, chr8.5608, chr18.2000, chr1.17643, chr3.7269, chr6.13134, chr9.2805
Negative: chrUn-JH584304.24, chr7.2396, chr17.6535, chr6.12094, chr8.172, chr15.8918, chr11.12079, chr11.13430, chr2.3935, chr19.777
chr6.8341, chr7.14542, chr5.13002, chr13.3961, chr16.4566, chr2.3788, chr17.6380, chr17.4031, chr14.5832, chr11.1662
chr9.14138, chr15.7175, chr8.2200, chr16.4239, chr13.4796, chr7.1464, chr5.10024, chr14.4814, chr7.2908, chr2.4222
PC_ 3
Positive: chr2.10377, chr2.10376, chr14.1100, chr14.1099, chr14.1097, chr9.2707, chr8.8730, chr7.4817, chr13.3163, chr9.14138
chr9.37, chrUn-GL456392.8, chr7.1843, chr3.9511, chr11.6151, chr10.11564, chr5.12113, chr17.2948, chr8.12222, chr6.8341
chr7.5257, chr14.1101, chr4.6724, chr5.6789, chr8.11579, chr8.6264, chr14.4177, chrUn-JH584304.45, chr16.4566, chr15.8918
Negative: chr17.4840, chr11.13342, chr3.140, chr9.14010, chr16.5778, chr5.15319, chr16.809, chr17.4839, chr17.4484, chr6.14
chr13.4911, chr17.4841, chr18.5839, chr2.279, chr12.5319, chr13.3680, chr17.7521, chr2.1882, chr9.50, chr7.15097
chr6.4270, chr1.18288, chrUn-JH584304.24, chr11.4950, chr16.3943, chr9.12238, chr13.4910, chr4.9343, chr19.6629, chr13.9817
PC_ 4
Positive: chr17.4840, chr2.10376, chrUn-JH584304.45, chr16.5778, chr11.13342, chr5.15319, chr2.10377, chr9.2707, chr14.1098, chr14.1100
chr14.1099, chrUn-JH584304.25, chr9.14010, chr9.29, chrUn-JH584304.32, chr9.12238, chrUn-GL456389.5, chr9.20, chr3.140, chr9.50
chr14.1097, chrUn-JH584304.27, chr17.4839, chrUn-GL456392.8, chr14.1101, chrX.5319, chr9.22, chr9.17, chr9, chr16.809
Negative: chr4.4129, chr3.441, chr1.12779, chr18.4299, chr11.9229, chr15.1189, chr6.1987, chr14.8104, chr13.5298, chr11.4637
chr5.11647, chr1.1366, chr15.6494, chr4.3027, chr7.2581, chr13.8478, chr5.901, chr6.7484, chr12.2732, chr2.8239
chr4.10623, chr6.12947, chr5.6268, chr16.4955, chr13.6718, chr18.5959, chr18.1631, chr1.10079, chr13.1165, chr1.11290
PC_ 5
Positive: chr2.10377, chr2.10376, chr14.1100, chr14.1099, chr14.1097, chr9.2707, chr9.20, chr9.30, chr14.1101, chrUn-JH584304.35
chrUn-GL456392.8, chr9.17, chr14.1098, chrUn-GL456389.5, chr9.22, chrUn-JH584304.46, chr9.1, chrUn-JH584304.45, chr9, chr12.5
chrUn-JH584304.29, chr9.29, chrUn-JH584304.43, chr16.4769, chr10.3687, chr9.37, chr13.6804, chr9.19, chr2.8133, chr3.11150
Negative: chr18.1158, chr7.12444, chr8.9544, chr7.15636, chr12.1679, chr15.2983, chr9.2390, chr12.985, chr1.8651, chr17.209
chr4.8651, chr10.6488, chr9.11151, chr11.8423, chr11.1655, chr2.316, chr18.6773, chr3.763, chr5.2512, chr8.8271
chr16.3924, chr15.2399, chr1.13608, chr7.6175, chr11.14759, chr10.7206, chr4.4494, chr11.10994, chr13.1036, chr15.5491
[1] "Processing Function: Calculating LSI"
Warning: RunLSI is being moved to Signac. Equivalent functionality can be
achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more
information on Signac, please see https://github.com/timoast/Signac
Warning: RunLSI is being moved to Signac. Equivalent functionality can be
achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more
information on Signac, please see https://github.com/timoast/Signac
Warning: RunLSI is being moved to Signac. Equivalent functionality can be
achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more
information on Signac, please see https://github.com/timoast/Signac
Performing TF-IDF normalization
Running SVD on TF-IDF matrix
Scaling cell embeddings
# 3 - Cluster the ATAC data
atac <- FindNeighbors(atac, dims = 1:N_PCs,k.param=N_Neighbours,reduction ='lsi' )
Computing nearest neighbor graph
Computing SNN
atac <- FindClusters(atac, resolution = ATAC_clusters_res,algorithm = 1)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 9080
Number of edges: 397754
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.7995
Number of communities: 9
Elapsed time: 1 seconds
table(atac$seurat_clusters)
0 1 2 3 4 5 6 7 8
3180 2043 1749 1023 469 203 147 142 124
#Copy the clustering output to another meta data location to avoid overwritting
atac$seurat_clusters_lsi <- atac$seurat_clusters
# 4- Project to the UMAP
atac <- RunUMAP(atac, dims = 1:N_PCs,reduction = "lsi")
17:18:51 UMAP embedding parameters a = 0.9922 b = 1.112
17:18:51 Read 9080 rows and found 20 numeric columns
17:18:51 Using Annoy for neighbor search, n_neighbors = 30
17:18:51 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
17:18:53 Writing NN index file to temp file /tmp/RtmpcMXYtB/file20abb7eab4ff6
17:18:53 Searching Annoy index using 1 thread, search_k = 3000
17:18:57 Annoy recall = 100%
17:19:00 Commencing smooth kNN distance calibration using 1 thread
17:19:01 Initializing from normalized Laplacian + noise
17:19:02 Commencing optimization for 500 epochs, with 366294 positive edges
17:19:36 Optimization finished
DimPlot(atac, reduction = "umap",pt.size = 0.25, label = TRUE, label.size = 8) + NoLegend()
# 5- Re-color the ATAC UMAP with the RNA cluster
atac$orig_clusters <- atac$seurat_clusters
atac@active.ident<- rna$seurat_clusters
# Plot the UMAP
DimPlot(atac, reduction = "umap",pt.size = 0.25, label = TRUE, label.size = 8) + NoLegend()
atac$active.ident <- atac$orig_clusters
# 1- Do imputation
pca_embeed <- Embeddings(rna)
out =knn.index(pca_embeed, k=impute_NN)
print("ATAC RNA-PCA Impute: Doing the imputation ")
[1] "ATAC RNA-PCA Impute: Doing the imputation "
atac_impute <- GetAssayData(atac) #Create zero array with the same dim. of the data
atac_data <- GetAssayData(atac)
atac_impute <- do.imputation(out,atac_data,atac_impute)
# 2 - Do processing
print("ATAC RNA-PCA Impute: Going to Processing function ")
[1] "ATAC RNA-PCA Impute: Going to Processing function "
atac_rna_guided <- CreateSeuratObject(count=atac_impute, project = "sciCAR")
atac_rna_guided<- process.object(atac_rna_guided, do_scaling = TRUE, do_PCA = TRUE, do_LSI = TRUE)
[1] "Processing Function: Scaling Data"
Centering and scaling data matrix
[1] "Processing Function: Calculating PCA"
Warning in PrepDR(object = object, features = features, verbose = verbose):
The following 18 features requested have zero variance (running reduction
without them): chr1.17279, chr10.12112, chr14.16, chr14.95, chr2.4332,
chr2.9489, chr2.16550, chr3.106, chr5.11249, chr7.3636, chr8.5234, chrX.
2065, chrX.2190, chrX.2521, chrX.4110, chrX.4127, chrX.4975, chrY.274
PC_ 1
Positive: chr3.140, chr9.14010, chr17.4840, chr13.4911, chr18.5839, chr11.13342, chr16.809, chr17.4839, chr5.15319, chr16.5778
chr17.4484, chr17.4841, chr12.5319, chr17.7521, chr6.14, chr8.1337, chr2.279, chr6.4270, chr9.12238, chr1.18288
chr7.15097, chr2.1882, chr13.3680, chr13.4910, chr4.9343, chr19.6629, chr3.4551, chr16.3943, chr19.725, chrUn-JH584304.24
Negative: chr1.11409, chr5.6789, chr5.14541, chr9.14138, chr5.13549, chr14.8731, chr3.6758, chr4.8042, chr4.8648, chr14.9056
chr11.14603, chr2.10377, chr10.11653, chr15.3442, chr1.614, chr18.1417, chr4.14813, chr1.11569, chr11.1079, chr8.6137
chr2.17548, chr13.4328, chr8.2274, chr3.1873, chr9.37, chr9.4096, chr18.3037, chr7.1715, chr10.6733, chr7.10310
PC_ 2
Positive: chr11.4950, chrUn-JH584304.24, chr17.83, chr16.2249, chr19.211, chr7.494, chr11.4783, chr19.814, chr4.10424, chr11.7159
chr3.8565, chr1.14195, chr12.7552, chr8.9485, chr7.13165, chr2.18148, chr15.7249, chr4.14566, chr1.6176, chr11.9845
chr4.15180, chr11.6170, chr2.2330, chr10.3231, chr17.7836, chr17.4377, chr7.1754, chr18.174, chr11.6657, chr9.9299
Negative: chr2.10376, chr2.10377, chr14.1099, chr14.1100, chrUn-GL456389.5, chr14.1097, chr9.2707, chr6.13135, chr8.5002, chrUn-GL456392.8
chr6.9903, chr14.1098, chr8.345, chr17.6146, chr8.10250, chr11.3486, chr4.9101, chr9.32, chr13.1563, chrUn-JH584304.32
chrUn-JH584304.43, chr2.6904, chr9.13088, chr5.13481, chr12.5, chr14.7886, chr10.779, chr5.7810, chr10.5228, chr10.6126
PC_ 3
Positive: chr7.9163, chr11.1337, chr2.13103, chr17.794, chr2.16397, chr5.8354, chr5.12536, chr13.1875, chr11.15144, chr11.3172
chr4.8042, chr1.6596, chr7.16443, chr5.3254, chr2.15173, chr2.9630, chr8.2198, chr10.1250, chr11.11456, chr5.12234
chr14.2356, chr6.908, chr15.9015, chrX.5946, chr12.7976, chr7.3173, chr12.8442, chr7.16039, chr7.10727, chr3.11521
Negative: chr2.5865, chr9.96, chrX.4239, chr6.7702, chr11.2964, chr8.9288, chr17.1871, chr6.9947, chr8.10651, chr12.9286
chr15.3324, chr16.6538, chr1.2765, chr14.6921, chr19.6344, chr7.732, chr16.5458, chr19.6470, chr1.16099, chr2.17419
chr9.8889, chrX.6369, chr3.13906, chr15.8275, chr15.9110, chr12.2302, chr11.839, chr2.8857, chr12.4174, chr7.8855
PC_ 4
Positive: chr13.4829, chrX.6243, chr10.4513, chr1.10717, chr6.4756, chr12.924, chr2.5865, chrX.4239, chr6.9947, chr12.3959
chr11.2964, chr15.3324, chr3.3724, chr1.16099, chr15.9110, chr8.9288, chr8.10651, chr10.2233, chr3.3419, chr10.3532
chr11.3816, chr17.1871, chr3.13906, chr12.9286, chr10.4602, chr9.8889, chr4.1837, chr4.5029, chr7.732, chr10.11411
Negative: chr11.6894, chr15.6798, chr3.11858, chr1.5110, chr11.701, chr14.2381, chr4.10999, chr1.16459, chr1.16271, chr6.12094
chr15.6787, chr14.5378, chr3.2518, chr8.248, chr15.1622, chr12.6124, chr7.16648, chr13.3567, chr4.11659, chr8.10625
chr14.9153, chr19.2766, chr10.1222, chr1.17836, chr4.11724, chr4.3972, chr1.6914, chr4.12829, chr2.18196, chr2.10458
PC_ 5
Positive: chr8.5761, chr17.5224, chr4.9914, chr2.7683, chr18.2614, chr3.7546, chr7.11511, chr9.12077, chr13.4310, chr1.4004
chr15.2108, chr5.4085, chr7.6825, chr7.11766, chr1.4016, chr18.4784, chr6.6348, chr2.7392, chr2.5005, chr13.5099
chr11.12380, chr11.14475, chr14.4664, chr12.2904, chr8.2208, chrX.868, chr11.1646, chr8.10782, chr17.6688, chr11.7728
Negative: chr6.2906, chr7.16242, chr16.2117, chr4.8476, chr7.15757, chr8.901, chr16.2434, chr4.13504, chr7.13385, chr15.1980
chr9.4234, chr2.11117, chr1.6209, chr14.3802, chr3.6628, chr5.4951, chr19.264, chr15.4702, chr6.14513, chr2.8037
chr13.1700, chrY.2046, chr11.367, chr3.8453, chr11.5147, chr4.4705, chr14.336, chr19.189, chr1.7622, chr18.3244
[1] "Processing Function: Calculating LSI"
Warning: RunLSI is being moved to Signac. Equivalent functionality can be
achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more
information on Signac, please see https://github.com/timoast/Signac
Warning: RunLSI is being moved to Signac. Equivalent functionality can be
achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more
information on Signac, please see https://github.com/timoast/Signac
Warning: RunLSI is being moved to Signac. Equivalent functionality can be
achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more
information on Signac, please see https://github.com/timoast/Signac
Performing TF-IDF normalization
Running SVD on TF-IDF matrix
Scaling cell embeddings
# 3- Measure Data sparsity
x= sum(GetAssayData(atac_rna_guided$RNA)!=0)/ncol(atac_rna_guided)
y = 100/nrow(atac_rna_guided)
x*y
[1] 11.04943
# 4- Cluster the RNA-guided imputed ATAC data
atac_rna_guided <- FindNeighbors(atac_rna_guided, dims = 1:N_PCs,k.param=N_Neighbours,reduction ='lsi' )
Computing nearest neighbor graph
Computing SNN
atac_rna_guided <- FindClusters(atac_rna_guided, resolution = ATAC_clusters_res,algorithm = 1)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 9080
Number of edges: 563136
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9106
Number of communities: 20
Elapsed time: 1 seconds
#Copy the clustering output to another meta data location to avoid overwritting
atac_rna_guided$seurat_clusters_lsi <- atac_rna_guided$seurat_clusters
atac_rna_guided <- RunUMAP(atac_rna_guided, dims = 1:N_PCs,reduction = "lsi")
18:13:40 UMAP embedding parameters a = 0.9922 b = 1.112
18:13:40 Read 9080 rows and found 20 numeric columns
18:13:40 Using Annoy for neighbor search, n_neighbors = 30
18:13:40 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
18:13:49 Writing NN index file to temp file /tmp/RtmpcMXYtB/file20abbd0d26a6
18:13:49 Searching Annoy index using 1 thread, search_k = 3000
18:13:53 Annoy recall = 100%
18:13:53 Commencing smooth kNN distance calibration using 1 thread
18:13:54 Initializing from normalized Laplacian + noise
18:13:55 Commencing optimization for 500 epochs, with 331928 positive edges
18:14:25 Optimization finished
# 5- Draw the UMAP
DimPlot(atac_rna_guided, reduction = "umap",pt.size = 0.25, label = TRUE, label.size = 8) + NoLegend()
# 6- Recolor the RNA-guided imputed ATAC data with the RNA cluster
atac_rna_guided$orig_clusters <- atac_rna_guided$seurat_clusters
rna$orig_clusters <- rna$seurat_clusters
atac_rna_guided@active.ident<- rna$orig_clusters
DimPlot(atac_rna_guided, reduction = "umap",pt.size = 0.25, label = TRUE, label.size = 8) + NoLegend()
# 7- Create contingency table between both data
my_table_0 <- table(rna$seurat_clusters, atac_rna_guided$seurat_clusters)
rna_atac_impute_table<- prop.table(my_table_0, margin = 1)
This is the Second major step of scMoC after imputing the Data , we fine tune the clusters
splitting<-which(rna_atac_impute_table> 0.1 & rna_atac_impute_table<0.9,arr.ind = TRUE)
# add a col. for the new cluster information
splitting<-cbind(splitting,NA)
colnames(splitting)[3]<-'New_cluster'
rna@active.ident<-rna$orig_clusters
rna_tmp <- as.numeric(rna$orig_clusters)
atac_tmp <- as.numeric(atac_rna_guided$orig_clusters)
for (i in 1: nrow(splitting))
{
new_lvl <- max(as.numeric(rna$orig_clusters)) + i -1
rna@active.ident <- factor(rna@active.ident,levels = c(levels(rna@active.ident),new_lvl))
rna@active.ident[intersect(which(rna_tmp == splitting[i,1]),which(atac_tmp==splitting[i,2]))] <- as.factor(new_lvl)
splitting[i,3]<-new_lvl
}
#calc. the Distances to every cluster
dims <- 1:N_PCs
dist.matrix <-dist(x = Embeddings(object = rna[['pca']])[, dims], upper = TRUE, diag = TRUE)
tmp_dist <- as.matrix(dist.matrix)
cluster_dist <- matrix(0,ncol = max(as.numeric(rna@active.ident)),nrow = nrow(tmp_dist))
for (i in 1 : max(as.numeric(rna@active.ident)))
{
cluster_dist[,i] <- apply(tmp_dist[rna@active.ident==(i-1),],2,mean)
}
#itterate over the clusters
for (i in 1:max(rna_tmp))
{
# Check if the cluster has been split or not
tmp_splitting = splitting[which(splitting[,'row']==i),]
if (length(tmp_splitting)>3) # Has more than 1 reading
{
print(i)
tmp_cluster = apply(cluster_dist[rna@active.ident == (i-1),tmp_splitting[,3]],1,which.min)
final_cluster <- tmp_splitting[,3][tmp_cluster]
rna@active.ident[rna@active.ident == (i-1)] <- final_cluster
}else if(length(tmp_splitting)>0)
{
final_cluster<- tmp_splitting[3]
rna@active.ident[rna@active.ident == (i-1)] <- final_cluster
}
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 7
[1] 8
[1] 9
[1] 12
[1] 13
# Sort the new clusters according to cluster size
unsorted_table <- (table(rna@active.ident))
sorted_table<-sort(unsorted_table [unsorted_table!=0] , decreasing = T)
mapping_table <- rownames(as.matrix(sorted_table))
mapping_table<- cbind(mapping_table,0:(length(mapping_table)-1))
# Re-assign the cells to the new clusters
for (i in 1:length(rna@active.ident))
{
tmp_cell<-rna@active.ident[i]
rna@active.ident[i] <- mapping_table[which(mapping_table[,1] == tmp_cell),2]
}
#Setting the cluster numbers to the already used clusters only
rna@active.ident <- factor(rna@active.ident,levels = 0: (nrow(mapping_table)-1))
# Draw the UMAP with the scMoC clusters colors
DimPlot(rna, reduction = "umap",pt.size = 0.25, label = TRUE, label.size = 8) + NoLegend()
rna.markers3 <- FindAllMarkers(rna, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
Calculating cluster 0
Calculating cluster 1
Calculating cluster 2
Calculating cluster 3
Calculating cluster 4
Calculating cluster 5
Calculating cluster 6
Calculating cluster 7
Calculating cluster 8
Calculating cluster 9
Calculating cluster 10
Calculating cluster 11
Calculating cluster 12
Calculating cluster 13
Calculating cluster 14
Calculating cluster 15
Calculating cluster 16
Calculating cluster 17
Calculating cluster 18
Calculating cluster 19
Calculating cluster 20
Calculating cluster 21
Calculating cluster 22
Calculating cluster 23
Calculating cluster 24
rna_maker_genes3<-rna.markers3%>% group_by(cluster) %>% top_n(n = N_genes, wt = avg_logFC)
common_marker_genes_idx3<-intersect(marker_genes[,2],rna_maker_genes3$gene)
myplot = DotPlot(rna,features = common_marker_genes_idx3
,cols = c("gray", "blue")) +coord_flip()